library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load the training data; the first row of the CSV supplies the column names.
df <- read_csv(file = "paint_project_train_data.csv", col_names = TRUE)
## Rows: 835 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Lightness, Saturation
## dbl (6): R, G, B, Hue, response, outcome
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick structural overview: column names, types and the first few values.
glimpse(df)
## Rows: 835
## Columns: 8
## $ R <dbl> 172, 26, 172, 28, 170, 175, 90, 194, 171, 122, 0, 88, 144, …
## $ G <dbl> 58, 88, 94, 87, 66, 89, 78, 106, 68, 151, 121, 140, 82, 163…
## $ B <dbl> 62, 151, 58, 152, 58, 65, 136, 53, 107, 59, 88, 58, 132, 50…
## $ Lightness <chr> "dark", "dark", "dark", "dark", "dark", "dark", "dark", "da…
## $ Saturation <chr> "bright", "bright", "bright", "bright", "bright", "bright",…
## $ Hue <dbl> 4, 31, 8, 32, 5, 6, 34, 10, 1, 21, 24, 22, 36, 16, 26, 12, …
## $ response <dbl> 12, 10, 16, 10, 11, 16, 10, 19, 14, 25, 14, 19, 14, 38, 15,…
## $ outcome <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
# Add a readable label for the binary outcome (1 -> "event", anything else ->
# "non_event"). The column is referenced through mutate()'s data mask instead
# of `df$outcome`, so the step keeps working on whatever data is piped in
# (e.g. after a filter or group_by) rather than on the global `df`.
df <- df %>%
  mutate(outcome_event = if_else(outcome == 1, "event", "non_event"))

# Bar chart of the class counts to check how balanced the two outcomes are.
# geom_col() draws bars from precomputed heights (the `n` column from count()).
df %>%
  count(outcome_event) %>%
  ggplot(mapping = aes(x = outcome_event, y = n)) +
  geom_col(fill = "steelblue", alpha = 0.8)
The binary outcome is not balanced: the number of events differs markedly from
the number of non_events, with non_events being significantly more numerous.
Histograms or density plots for the continuous variables: are the distributions Gaussian-like?
# Kernel density estimate of the R input.
df %>%
  ggplot(mapping = aes(x = R)) +
  geom_density(fill = "steelblue", alpha = 0.5) +
  labs(title = "Density Plot of Variable R", x = "Value of R", y = "Density")

# Bin R into 20-unit intervals between 0 and 300 (lowest edge included) and
# show the number of observations falling in each bin.
df_bin_R <- mutate(
  df,
  R_bin = cut(
    R,
    breaks = seq(from = 0, to = 300, by = 20),
    include.lowest = TRUE
  )
)
ggplot(data = df_bin_R, mapping = aes(x = R_bin)) +
  geom_bar(fill = "steelblue")
# Kernel density estimate of the G input.
df %>%
  ggplot(mapping = aes(x = G)) +
  geom_density(fill = "steelblue", alpha = 0.5) +
  labs(title = "Density Plot of Variable G", x = "Value of G", y = "Density")

# Bin G into 20-unit intervals between 0 and 300 (lowest edge included) and
# show the number of observations falling in each bin.
df_bin_G <- mutate(
  df,
  G_bin = cut(
    G,
    breaks = seq(from = 0, to = 300, by = 20),
    include.lowest = TRUE
  )
)
ggplot(data = df_bin_G, mapping = aes(x = G_bin)) +
  geom_bar(fill = "steelblue")
# Kernel density estimate of the B input.
df %>%
  ggplot(mapping = aes(x = B)) +
  geom_density(fill = "steelblue", alpha = 0.5) +
  labs(title = "Density Plot of Variable B", x = "Value of B", y = "Density")

# Bin B into 20-unit intervals between 0 and 300 (lowest edge included) and
# show the number of observations falling in each bin.
df_bin_B <- mutate(
  df,
  B_bin = cut(
    B,
    breaks = seq(from = 0, to = 300, by = 20),
    include.lowest = TRUE
  )
)
ggplot(data = df_bin_B, mapping = aes(x = B_bin)) +
  geom_bar(fill = "steelblue")
# Kernel density estimate of the Hue input.
df %>%
  ggplot(mapping = aes(x = Hue)) +
  geom_density(fill = "steelblue", alpha = 0.5) +
  labs(title = "Density Plot of Variable Hue", x = "Value of Hue", y = "Density")

# Bin Hue into 5-unit intervals between 0 and 40 (lowest edge included) and
# show the number of observations falling in each bin.
df_bin_Hue <- mutate(
  df,
  Hue_bin = cut(
    Hue,
    breaks = seq(from = 0, to = 40, by = 5),
    include.lowest = TRUE
  )
)
ggplot(data = df_bin_Hue, mapping = aes(x = Hue_bin)) +
  geom_bar(fill = "steelblue")
The distributions of the continuous variables R, G and B look roughly Gaussian, but the distribution of Hue does not.
Are there differences in continuous variable distributions and continuous variable summary statistics based on categorical variable values?
# Response against each continuous input, coloured by Lightness and faceted by
# Saturation. Line width is set with `linewidth`: using `size` for lines is
# deprecated as of ggplot2 3.4.0 and triggers a deprecation warning.

# Response vs R.
df %>%
  ggplot(aes(x = R, y = response, color = Lightness)) +
  geom_line(linewidth = 1.2) +
  facet_wrap(~Saturation)

# Response vs G.
df %>%
  ggplot(aes(x = G, y = response, color = Lightness)) +
  geom_line(linewidth = 1.2) +
  facet_wrap(~Saturation)

# Response vs B.
df %>%
  ggplot(aes(x = B, y = response, color = Lightness)) +
  geom_line(linewidth = 1.2) +
  facet_wrap(~Saturation)

# Response vs Hue.
df %>%
  ggplot(aes(x = Hue, y = response, color = Lightness)) +
  geom_line(linewidth = 1.2) +
  facet_wrap(~Saturation)

# Response vs the sum of all four continuous inputs.
df %>%
  ggplot(aes(x = R + G + B + Hue, y = response, color = Lightness)) +
  geom_line(linewidth = 1.2) +
  facet_wrap(~Saturation)
Overall, the trend is the same across the plots. However, variable B shows a slightly different trend for the bright and pure categories.
# Build the modelling frame: rescale `response` from the 0-100 range to a
# proportion, logit-transform it into `y`, and keep only the inputs, the raw
# response and the transformed response.
df_logit <- df %>%
  mutate(y = boot::logit((response - 0) / (100 - 0))) %>%
  select(R, G, B, Lightness, Saturation, Hue, response, y)

# Confirm the resulting columns and types.
glimpse(df_logit)
## Rows: 835
## Columns: 8
## $ R <dbl> 172, 26, 172, 28, 170, 175, 90, 194, 171, 122, 0, 88, 144, …
## $ G <dbl> 58, 88, 94, 87, 66, 89, 78, 106, 68, 151, 121, 140, 82, 163…
## $ B <dbl> 62, 151, 58, 152, 58, 65, 136, 53, 107, 59, 88, 58, 132, 50…
## $ Lightness <chr> "dark", "dark", "dark", "dark", "dark", "dark", "dark", "da…
## $ Saturation <chr> "bright", "bright", "bright", "bright", "bright", "bright",…
## $ Hue <dbl> 4, 31, 8, 32, 5, 6, 34, 10, 1, 21, 24, 22, 36, 16, 26, 12, …
## $ response <dbl> 12, 10, 16, 10, 11, 16, 10, 19, 14, 25, 14, 19, 14, 38, 15,…
## $ y <dbl> -1.9924302, -2.1972246, -1.6582281, -2.1972246, -2.0907411,…
Can you identify any clear trends? Do the trends depend on the categorical INPUTS?
# LOESS-smoothed logit-transformed response (`y`) against each continuous
# input, coloured by Lightness, with one free-scaled panel per Saturation level.
# The smoother's line width is set with `linewidth`; `size` for lines is
# deprecated as of ggplot2 3.4.0.

# y vs R.
df_logit %>%
  ggplot(mapping = aes(x = R, y = y, color = Lightness)) +
  geom_smooth(method = "loess", formula = y ~ x, linewidth = 0.4) +
  facet_wrap(~Saturation, scales = "free") +
  theme_bw()

# y vs G.
df_logit %>%
  ggplot(mapping = aes(x = G, y = y, color = Lightness)) +
  geom_smooth(method = "loess", formula = y ~ x, linewidth = 0.4) +
  facet_wrap(~Saturation, scales = "free") +
  theme_bw()

# y vs B.
df_logit %>%
  ggplot(mapping = aes(x = B, y = y, color = Lightness)) +
  geom_smooth(method = "loess", formula = y ~ x, linewidth = 0.4) +
  facet_wrap(~Saturation, scales = "free") +
  theme_bw()

# y vs Hue.
df_logit %>%
  ggplot(mapping = aes(x = Hue, y = y, color = Lightness)) +
  geom_smooth(method = "loess", formula = y ~ x, linewidth = 0.4) +
  facet_wrap(~Saturation, scales = "free") +
  theme_bw()
# Stack the four continuous inputs into long format (`name` / `value`) so each
# one gets its own free-scaled panel, ordered R, G, B, Hue. Points and a linear
# fit with its confidence band are drawn per Lightness level.
df %>%
  pivot_longer(cols = c(R, G, B, Hue)) %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(colour = Lightness)) +
  geom_smooth(
    mapping = aes(colour = Lightness, fill = Lightness),
    method = lm,
    formula = y ~ x
  ) +
  facet_wrap(
    ~factor(name, levels = c("R", "G", "B", "Hue")),
    scales = "free"
  ) +
  theme_bw()
Yes, I think we can observe a clear upward trend for the continuous variables R, G and B across the categories. Hue, by contrast, does not show the same clear trend.
# Binary outcome against each of R, G and B, one free-scaled panel per
# Saturation level. Points are coloured by the (numeric) outcome value.

# outcome vs R.
ggplot(data = df, mapping = aes(x = R, y = outcome)) +
  geom_point(mapping = aes(colour = outcome), size = 0.1) +
  facet_wrap(~Saturation, scales = "free")

# outcome vs G.
ggplot(data = df, mapping = aes(x = G, y = outcome)) +
  geom_point(mapping = aes(colour = outcome), size = 0.1) +
  facet_wrap(~Saturation, scales = "free")

# outcome vs B.
ggplot(data = df, mapping = aes(x = B, y = outcome)) +
  geom_point(mapping = aes(colour = outcome), size = 0.1) +
  facet_wrap(~Saturation, scales = "free")
# Proportion of each outcome value within every Saturation level
# (bars are stacked and normalised to 1 by position = "fill").
ggplot(data = df, mapping = aes(x = Saturation, fill = as.factor(outcome))) +
  geom_bar(position = "fill") +
  scale_fill_brewer(name = "outcome") +
  theme_bw()

# Same view, broken down by Lightness level instead.
ggplot(data = df, mapping = aes(x = Lightness, fill = as.factor(outcome))) +
  geom_bar(position = "fill") +
  scale_fill_brewer(name = "outcome") +
  theme_bw()